// vim: set syntax=asm :

// Per-column broadcast operation (Liquid template emitting Intel-syntax x86-64).
//
// Template variables read here:
//   L, label  - prefix and name of the emitted entry label
//   mr        - mr/8 consecutive ymm accumulators are updated per loaded scalar
//   from, to  - first and last accumulator register indices (inclusive)
//   type      - "f16" selects the half-precision load path, anything else
//               loads 32-bit scalars directly
//   op        - three-operand AVX mnemonic applied to each accumulator
//   flipped   - when truthy, the accumulator is the first source operand
//
// Registers:
//   rax       - cursor over the scalar array; its start address is read from
//               [rdi + 8] (NOTE(review): presumably rdi points at the current
//               operation descriptor - confirm against the dispatcher)
//   ymm<to+1> - scratch register holding the broadcast scalar
//
// Control leaves through a jump to the <L>non_linear_loop label, which is
// defined outside this template.
{{L}}{{label}}:
    mov             rax, [ rdi + 8 ]

{% capture mr_over_8 %}{{ mr | divided_by: 8}}{%endcapture%}
{% capture mr_over_8_min_1 %}{{ mr | divided_by: 8 | minus: 1}}{%endcapture%}

{%capture tmp%}{{to | plus: 1 }}{%endcapture%}

{%capture cols%}{{to | plus: 1| minus:from| divided_by:mr_over_8}}{%endcapture%}
{%capture cols_min_1%}{{to | plus: 1| minus:from| divided_by:mr_over_8|minus:1}}{%endcapture%}


{% for right in (0..cols_min_1) %}
    {% if type == "f16" %}
        // Load one half-precision scalar, widen it to f32 and splat it.
        // vpinsrw is the VEX form: the legacy SSE pinsrw would interleave
        // non-VEX and VEX instructions while ymm accumulators are live.
        // It zeroes bits 128..255 of the scratch register, which the two
        // instructions below overwrite anyway.
        vpinsrw         xmm{{tmp}}, xmm{{tmp}}, word ptr [ rax ], 0
        add             rax, 2
        vcvtph2ps       ymm{{tmp}}, xmm{{tmp}}
        vbroadcastss    ymm{{tmp}}, xmm{{tmp}}
    {% else %}
        // Splat one 32-bit scalar straight from memory.
        vbroadcastss    ymm{{tmp}}, dword ptr [ rax ]
        add             rax, 4
    {% endif %}
    {% for down in (0..mr_over_8_min_1) %}
        {%capture acc%}{{mr_over_8|times:right|plus:from|plus:down}}{%endcapture%}
        {% if flipped %}
            {{op}} ymm{{acc}}, ymm{{acc}}, ymm{{tmp}}
        {% else %}
            {{op}} ymm{{acc}}, ymm{{tmp}}, ymm{{acc}}
        {% endif %}
    {% endfor %}
{% endfor %}

    jmp {{L}}non_linear_loop
